#!/usr/bin/python

''' Set of functions to prepare text for the dynamic-nmf routines. '''

import os, os.path, sys, codecs
import logging as log
import text.util
import shutil

def _lookup_slice(doc, dir_map):
    ''' Takes a list of (text, slice) tuples and returns
        the correct time slice directory '''
    
    # Get documents time slice
    slice = doc[-1]
    
    # Map to the correct slice directory
    for dir in dir_map:
        if dir[1] == slice:
            return dir

def create_text_files(slice_data, data_dir, overwrite = False):
    ''' Write one text file per document into per-time-slice directories.

        Inputs
        ------

        slice_data: Sequence of document rows. For each row, row[1] is the
                    text written to disk and row[-1] is the time slice the
                    document belongs to (row[0] is presumably a document
                    identifier; it is not used here). The sequence is
                    iterated twice, so it must not be a one-shot iterator.
        data_dir:   Existing directory in which the slice directories
                    'time00', 'time01', ... are created, one per distinct
                    slice value in sorted order.
        overwrite:  If True, an already existing slice directory is removed
                    (with all of its contents) and recreated. If False, an
                    existing slice directory aborts the call.

        Output
        ------
        Returns a list of (index, slice, dir_path) tuples, one per slice
        directory. Returns the integer 1 instead when a slice directory
        already exists and overwrite is False; directories created earlier
        in the same call are left in place.

        Documents are written as 'doc%06d.txt', numbered by their position
        in slice_data, so file docNNNNNN.txt corresponds to slice_data[N].

        Raises OSError when a slice directory cannot be created for a
        reason other than it already existing (e.g. data_dir is missing). '''

    # Distinct time slices, in sorted order
    slices = sorted(set(row[-1] for row in slice_data))

    # Make new directories to hold data
    dir_paths = []
    for i, slice_value in enumerate(slices):
        dir_path = data_dir + '/time%02d' % i

        try:
            os.mkdir(dir_path)
        except OSError:
            # Only "path already exists" is handled here; anything else
            # (missing parent, permissions, ...) is a real error and must
            # not be reported as an existing directory.
            if not os.path.exists(dir_path):
                raise

            if not overwrite:
                print("File already exists: %s" % dir_path)
                print("Use overwrite = True to ignore this warning.")
                return 1

            # Replace the stale directory with an empty one
            shutil.rmtree(dir_path)
            os.mkdir(dir_path)

        # Save the id, slice, dir mapping
        dir_paths.append((i, slice_value, dir_path))

    # Build the slice -> directory lookup once instead of scanning
    # dir_paths for every document
    slice_to_dir = {}
    for _, slice_value, dir_path in dir_paths:
        slice_to_dir[slice_value] = dir_path

    # Write text files (1 per document) to the appropriate
    # time slice directory
    for d, doc in enumerate(slice_data):
        docpath = slice_to_dir[doc[-1]] + '/doc%06d.txt' % d
        with open(docpath, 'w') as txtfile:
            txtfile.write(doc[1])

    return dir_paths
    

def process_text(inpaths, min_df = 10, tfidf = True, norm = True, minlen = 0):
    ''' Run feature extraction on each directory of individual text files.

        Inputs
        ------

        inpaths: List of paths to directories holding the input text
        min_df:  Minimum number of documents for a term to appear
                 (forwarded to text.util.preprocess as min_df)
        tfidf:   Apply TF-IDF term weight to the document-term matrix
                 (forwarded as apply_tfidf)
        norm:    Apply unit length normalization to the document-term matrix
                 (forwarded as apply_norm)
        minlen:  Minimum document length, handed to
                 text.util.DocumentBodyGenerator

        Output
        ------
        Returns a list of dictionaries, one per input directory and in the
        same order, each holding
        dir_name: the base name of the directory path
        X:        document-term matrix returned by text.util.preprocess
        terms:    the terms returned by text.util.preprocess
        doc_ids:  document IDs, in the order the documents were read
        '''

    results = []

    for in_path in inpaths:
        dir_name = os.path.basename(in_path)
        print("Processing %s" % dir_name)

        # Collect the body and id of every document in this directory
        bodies, ids = [], []
        for doc_id, body in text.util.DocumentBodyGenerator([in_path], minlen):
            ids.append(doc_id)
            bodies.append(body)
        print("Found %d documents to parse" % len(bodies))

        # Turn the raw documents into a document-term matrix
        X, terms = text.util.preprocess(
            bodies,
            min_df = min_df,
            apply_tfidf = tfidf,
            apply_norm = norm,
        )
        print("Created %dx%d document-term matrix" % X.shape)

        # Keep everything produced for this directory together
        results.append(dict(dir_name = dir_name, X = X, terms = terms, doc_ids = ids))

    return results
